library(pacman)
p_load(tidyverse, lubridate, ggplot2, plotly, viridis, gplots)

# Load the arrests extract. Column types are declared explicitly rather than
# guessed: arrest_date and apprehension_landmark are read as text, every other
# listed column is read as a factor.
# col_factor() without `levels` takes its levels in order of first appearance,
# so factor ordering (e.g. year/month on plot axes) follows the file's row
# order -- NOTE(review): confirm the input is sorted the way the plots expect.
arrests <- read_delim(
  file = here::here("analyze", "input", "arrests_with_state.csv.gz"),
  delim = ",",
  col_types = cols(
    arrest_date = col_character(),
    date_of_birth = col_factor(),
    gender = col_factor(),
    country_of_citizenship = col_factor(),
    event_number = col_factor(),
    apprehension_landmark = col_character(),
    arrest_method = col_factor(),
    most_serious_conviction = col_factor(),
    year = col_factor(),
    month = col_factor(),
    day = col_factor(),
    state = col_factor()
  )
)


# names(arrests) <- tolower(names(arrests))
# names(arrests) <- str_replace_all(names(arrests), "\\s", "_")

# Parse the month-day-year text dates into Date values, derive an abbreviated
# weekday label from them, then drop the two columns the plots do not use.
# year, month and day are not re-derived from arrest_date here (the old
# lubridate year()/month()/day() assignments are disabled): the col_types spec
# already reads those columns, as factors, from the input file.
arrests <- arrests %>%
  mutate(
    arrest_date = mdy(arrest_date),
    wday = as.factor(lubridate::wday(arrest_date, label = TRUE, abbr = TRUE))
  ) %>%
  dplyr::select(-c(event_number, date_of_birth))

Arrest counts by state: absolute arrest totals are higher in WA. Both states show an overall decline in arrests, but note the WA increase in 2017-2018, which is not mirrored in OR.

# Yearly arrest counts for OR and WA: one facet per state, with each bar's
# count printed above it.
# NOTE(review): group_by() looks redundant here, since the bars and labels are
# counted by the "count" stat -- confirm before removing.
p1 <- arrests %>%
  filter(state %in% c("OR", "WA")) %>%
  group_by(state, year) %>%
  ggplot(mapping = aes(x = year, fill = state)) +
  geom_bar() +
  geom_text(
    mapping = aes(label = after_stat(count)),
    stat = "count",
    vjust = -1.5,
    color = "black"
  ) +
  facet_wrap(vars(state)) +
  scale_fill_viridis_d() +
  # Fixed upper limit leaves room for the labels drawn above the bars.
  # NOTE(review): a state-year total above 5000 would fall outside this scale.
  ylim(0, 5000) +
  theme_minimal()

p1

OR arrests by month; no notable seasonal trends:

# Monthly arrest totals in OR, drawn as one line per year.
# count() tallies rows per (year, month) and returns an ungrouped tibble, so
# the result carries no leftover grouping and no summarise() grouping message
# is emitted.
# month is read as a factor, so the x axis follows its level order --
# NOTE(review): confirm the levels are in calendar order.
p2 <- arrests %>%
  filter(state == "OR") %>%
  count(year, month, name = "total_arrests") %>%
  ggplot(aes(x = month, y = total_arrests, color = year, group = year)) +
  geom_line() +
  ylim(0, NA) + # anchor the y axis at zero, upper limit taken from the data
  scale_color_viridis_d() +
  theme_minimal()

p2

OR arrests by day of week; note that most arrests occur on weekdays.

# OR arrests per weekday, with bars stacked and colored by year.
# NOTE(review): group_by() looks redundant here, since geom_bar() counts the
# rows itself -- confirm before removing.
p3 <- arrests %>%
  filter(state == "OR") %>%
  group_by(year, wday) %>%
  ggplot(mapping = aes(x = wday, group = year, fill = year, color = year)) +
  geom_bar(stat = "count") +
  ylim(0, NA) + # anchor the y axis at zero
  scale_fill_viridis_d() +
  scale_color_viridis_d()

p3

Arrest method by state. Note the shift in OR from “CAP Local Incarceration” to “Non-Custodial Arrest” starting in 2015, which is not mirrored in WA.

# Yearly arrest counts for OR and WA, stacked by arrest method, one panel per
# state; rendered as an interactive plotly figure.
# NOTE(review): group_by() looks redundant here, since geom_bar() counts the
# rows itself -- confirm before removing.
p1 <- arrests %>%
  filter(state %in% c("OR", "WA")) %>%
  group_by(state, year, arrest_method) %>%
  ggplot(mapping = aes(x = year, color = arrest_method, fill = arrest_method)) +
  geom_bar() +
  facet_grid(cols = vars(state)) +
  ylim(0, NA) + # anchor the y axis at zero
  scale_color_viridis_d(direction = -1) +
  scale_fill_viridis_d(direction = -1)

ggplotly(p1)

Arrest methods in OR only, as a percentage of each year's total:

# Share of OR arrests by arrest method within each year (bars filled to 100%),
# rendered as an interactive plotly figure.
# The y limits are set inside scale_y_continuous() rather than with a separate
# ylim() call: ylim() adds a second y scale that replaces the first one and
# silently discards the percent labels.
# arrest_method is already read as a factor by the col_types spec, and
# geom_bar() does its own counting, so no as.factor() or group_by() step is
# needed before plotting.
p1 <- arrests %>%
  filter(state == "OR") %>%
  ggplot(aes(x = year, fill = arrest_method, color = arrest_method)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent, limits = c(0, NA)) +
  scale_fill_viridis_d(direction = -1) +
  scale_color_viridis_d(direction = -1)

ggplotly(p1)

Arrest method by day of week; not very interesting:

# OR arrests per weekday, with bars stacked and colored by arrest method.
# NOTE(review): the as.factor() and group_by() steps look redundant
# (arrest_method is read as a factor, and geom_bar() counts the rows itself)
# -- confirm before removing.
p1 <- arrests %>%
  filter(state == "OR") %>%
  mutate(arrest_method = as.factor(arrest_method)) %>%
  group_by(arrest_method, wday) %>%
  ggplot(
    mapping = aes(
      x = wday,
      group = arrest_method,
      fill = arrest_method,
      color = arrest_method
    )
  ) +
  geom_bar(stat = "count") +
  ylim(0, NA) + # anchor the y axis at zero
  scale_fill_viridis_d() +
  scale_color_viridis_d()

p1

Country of citizenship

# Yearly OR arrest totals per country of citizenship, one line per country,
# rendered as an interactive plotly figure.
# count() tallies rows per (year, country) and returns an ungrouped tibble, so
# the result carries no leftover grouping and no summarise() grouping message
# is emitted.
p1 <- arrests %>%
  filter(state == "OR") %>%
  count(year, country_of_citizenship, name = "total") %>%
  ggplot(aes(
    x = year,
    y = total,
    color = country_of_citizenship,
    group = country_of_citizenship
  )) +
  geom_line()

ggplotly(p1)

Note the decreasing proportion of Mexican nationals and the increasing proportions of Guatemalan and Honduran nationals.

# The five most frequent countries of citizenship among OR arrests.
top <- arrests %>%
  filter(state == "OR") %>%
  count(country_of_citizenship, sort = TRUE) %>%
  head(5)

# Yearly share of OR arrests for those five countries (bars filled to 100%);
# every other country is pooled under "ALL OTHERS". Rendered with plotly.
# NOTE(review): group_by() looks redundant here, since geom_bar() counts the
# rows itself -- confirm before removing.
p1 <- arrests %>%
  filter(state == "OR") %>%
  mutate(
    country = if_else(
      country_of_citizenship %in% unlist(top$country_of_citizenship),
      as.character(country_of_citizenship),
      "ALL OTHERS"
    )
  ) %>%
  group_by(year, country) %>%
  ggplot(
    mapping = aes(x = year, group = country, fill = country, color = country)
  ) +
  geom_bar(position = "fill")

ggplotly(p1)